{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/XuyangAbert/EUFSFC/blob/master/example_eufsfc.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!git clone \"https://github.com/XuyangAbert/EUFSFC.git\""
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "EhUIaIZA6vt7",
        "outputId": "8eb6c2a2-4d21-48f6-9960-34d3c8ba5b2d"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Cloning into 'EUFSFC'...\n",
            "remote: Enumerating objects: 84, done.\u001b[K\n",
            "remote: Counting objects: 100% (84/84), done.\u001b[K\n",
            "remote: Compressing objects: 100% (82/82), done.\u001b[K\n",
            "remote: Total 84 (delta 46), reused 0 (delta 0), pack-reused 0\u001b[K\n",
            "Receiving objects: 100% (84/84), 1.31 MiB | 3.24 MiB/s, done.\n",
            "Resolving deltas: 100% (46/46), done.\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import numpy as np\n",
        "import math\n",
        "import pandas as pd\n",
        "import numpy.matlib as b\n",
        "from sklearn.preprocessing import normalize\n",
        "import time\n",
        "from EUFSFC.entropy_estimators import *"
      ],
      "metadata": {
        "id": "SusR9Ojd42Ez"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "class data_load(object):\n",
        "  def __init__(self, path, file):\n",
        "    self.df = pd.read_csv(path+file, header = None)\n",
        "    [N, L] = np.shape(self.df)\n",
        "    self.dim = L - 1\n",
        "    self.labels = self.df.iloc[:, L-1].values\n",
        "    self.data = self.df.iloc[:, 0:self.dim].values\n",
        "\n",
        "  def preprocess(self):\n",
        "    [N,L] = np.shape(self.data)\n",
        "    NewData = np.zeros((N,L))\n",
        "    for i in range(L):\n",
        "        Temp = self.data[:,i]\n",
        "        if max(Temp)==0:\n",
        "            NewData[:,i] = np.zeros((N,1))\n",
        "        else:\n",
        "            Temp = (Temp - np.min(Temp))/(max(Temp)-min(Temp))\n",
        "            NewData[:,i] = Temp\n",
        "    return NewData\n",
        "\n",
        "  def process(self):\n",
        "    self.normalized_data = self.preprocess()\n",
        "    return self.normalized_data, self.labels, self.data"
      ],
      "metadata": {
        "id": "CDxh9G_w49h8"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "class feature_similarity(object):\n",
        "  def __init__(self):\n",
        "    pass\n",
        "  def distribution_est(self, data, dim):\n",
        "    DC_mean = np.zeros(dim)\n",
        "    DC_std = np.zeros(dim)\n",
        "    for i in range(dim):\n",
        "        TempClass = data[:,i]\n",
        "        DC_mean[i] = np.mean(TempClass)\n",
        "        DC_std[i] = np.std(TempClass)\n",
        "    return DC_mean,DC_std\n",
        "\n",
        "  def feature_dist_cont(self, DC_means, DC_std, data, Var, dim, Corr):\n",
        "    DisC = np.zeros((dim,dim))\n",
        "    Dist = []\n",
        "    for i in range(dim):\n",
        "        for j in range(i,dim):\n",
        "            DisC[i,j] = self.kld_cal(data,i,j,Var,Corr)\n",
        "            DisC[j,i] = DisC[i,j]\n",
        "            Dist.append(DisC[i,j])\n",
        "    return DisC,Dist\n",
        "\n",
        "  def feature_dist_disc(self, data, dim):\n",
        "    DisC = np.zeros((dim,dim))\n",
        "    Dist = []\n",
        "    for i in range(dim):\n",
        "      if len(np.unique(data[:,i])) == 1:\n",
        "        DisC[i,:] = 1\n",
        "        DisC[:,i] = 1\n",
        "        Dist.append(1)\n",
        "        continue\n",
        "      for j in range(i,dim):\n",
        "          DisC[i,j] = self.sym_cal(data,i,j)\n",
        "          DisC[j,i] = DisC[i,j]\n",
        "          Dist.append(DisC[i,j])\n",
        "    return DisC,Dist\n",
        "\n",
        "  def kld_cal(self, data,i,j,Var,Corr):\n",
        "    Var1 = Var[i]\n",
        "    Var2 = Var[j]\n",
        "    P = Corr[i,j]\n",
        "    Sim = Var1 + Var2 - ((Var1 + Var2)**2 - 4 * Var1 * Var2 * (1 - P**2))**0.5\n",
        "    D_KL = Sim / (Var1 + Var2)\n",
        "    return D_KL\n",
        "\n",
        "  def sym_cal(self, data,i,j):\n",
        "    I_ij = midd(data[:,i],data[:,j])\n",
        "    H_I = entropyd(data[:,i])\n",
        "    H_J = entropyd(data[:,j])\n",
        "    D_KL = 1 - 2*(I_ij)/(H_I + H_J)\n",
        "    return D_KL"
      ],
      "metadata": {
        "id": "S1fFaNHX8Fkv"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def fitness_cal(DisC, DC_means, DC_std, data, StdF, gamma):\n",
        "    fitness = np.zeros(len(DC_means))\n",
        "    # print(np.shape(fitness))\n",
        "    for i in range(len(DC_means)):\n",
        "        TempSum = 0\n",
        "        for j in range(len(DC_means)):\n",
        "            if j != i:\n",
        "                D = DisC[i,j]\n",
        "                TempSum = TempSum + (math.exp(- (D**2) / StdF))**gamma\n",
        "        fitness[i] = TempSum\n",
        "    return fitness"
      ],
      "metadata": {
        "id": "m_oeBIN18Jhz"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "class feature_clustering(object):\n",
        "  def __init__(self):\n",
        "    pass\n",
        "  def pseduo_peaks(self, disc, dist, dc_mean, dc_std, data, fitness, stdf, gamma, var1):\n",
        "      # The temporal sample space in terms of mean and standard deviation\n",
        "      sample = np.vstack((dc_mean,dc_std)).T\n",
        "      # Search Stage of Pseduo Clusters at the temporal sample space\n",
        "      neirad = 0.15*max(dist) # 0.15\n",
        "      i = 0\n",
        "      marked = []\n",
        "      c_indices = np.arange(1, len(dc_mean)+1) # The pseduo Cluster label of features\n",
        "      peakindices = []\n",
        "      pfitness = []\n",
        "      co = []\n",
        "      fitn = fitness\n",
        "      while True:\n",
        "          peakindices.append(np.argmax(fitn))\n",
        "          pfitness.append(np.max(fitn))\n",
        "          indices = self.neighborsearch(disc, data,\n",
        "                                        sample, peakindices[i],\n",
        "                                        marked, neirad, var1)\n",
        "          c_indices[indices] = peakindices[i]\n",
        "          if len(indices) == 0:\n",
        "              indices=[peakindices[i]]\n",
        "          co.append(len(indices)) # Number of samples belong to the current\n",
        "          # identified pseduo cluster\n",
        "          marked = np.concatenate(([marked,indices]))\n",
        "          # Fitness Proportionate Sharing\n",
        "          fitn = self.sharing(fitn, indices)\n",
        "          # Check whether all of samples has been assigned a pseduo cluster label\n",
        "          if np.sum(co) >= (len(fitn)):\n",
        "              break\n",
        "          i=i+1 # Expand the size of the pseduo cluster set by 1\n",
        "      c_indices = self.close_fcluster(peakindices, disc, np.shape(disc)[0])\n",
        "      return peakindices, pfitness, c_indices\n",
        "\n",
        "  def neighborsearch(self, disC, data, sample, p_indice, marked, radius, var1):\n",
        "      cluster = []\n",
        "      for i in range(np.shape(sample)[0]):\n",
        "          if i not in marked:\n",
        "              dist = disC[i, p_indice]\n",
        "              if dist <= radius:\n",
        "                  cluster.append(i)\n",
        "      indices = cluster\n",
        "      return indices\n",
        "\n",
        "  def close_fcluster(self, fcluster, disc, dim):\n",
        "      f_indices = np.arange(dim)\n",
        "      for i in range(dim):\n",
        "          dist_fcluster = disc[i, fcluster]\n",
        "          f_indices[i] = fcluster[np.argmin(dist_fcluster)]\n",
        "      return f_indices\n",
        "\n",
        "  def sharing(self, fitness, indices):\n",
        "      newfitness = fitness\n",
        "      sum1 = 0\n",
        "      for j in range(len(indices)):\n",
        "          sum1 = sum1 + fitness[indices[j]]\n",
        "      for th in range(len(indices)):\n",
        "              newfitness[indices[th]] = fitness[indices[th]] / (1+sum1)\n",
        "      return newfitness\n",
        "\n",
        "  def pseduo_evolve(self, disc, peakindices, pseduof, c_indices,\n",
        "                    dc_mean, dc_std, data, fitness, stdf, gamma):\n",
        "      # Initialize the indices of Historical Pseduo Clusters and their fitness values\n",
        "      histcluster = peakindices\n",
        "      histclusterf = pseduof\n",
        "      while True:\n",
        "          # Call the merge function in each iteration\n",
        "          [cluster, cfitness, f_indices] = self.pseduo_merge(disc, histcluster,\n",
        "                                                             histclusterf, c_indices,\n",
        "                                                             dc_mean, dc_std, data,\n",
        "                                                             fitness, stdf, gamma)\n",
        "          # Check for the stablization of clutser evolution and exit the loop\n",
        "          if len(np.unique(cluster)) == len(np.unique(histcluster)):\n",
        "              break\n",
        "          # Update the feature indices of historical pseduo feature clusters and\n",
        "          # their corresponding fitness values\n",
        "          histcluster = cluster\n",
        "          histclusterf = cfitness\n",
        "          c_indices = f_indices\n",
        "      # Compute final evolved feature cluster information\n",
        "      fcluster = np.unique(cluster)\n",
        "      ffitness = cfitness\n",
        "      c_indices = f_indices\n",
        "\n",
        "      return fcluster, ffitness, c_indices\n",
        "\n",
        "  def pseduo_merge(self, disc, peakindices, pseduof, c_indices,\n",
        "                     dc_mean, dc_std, data, fitness, stdf, gamma):\n",
        "      # Initialize the pseduo feature clusters lables for all features\n",
        "      f_indices = c_indices\n",
        "      # Initialize the temporal sample space for feature means and stds\n",
        "      sample = np.vstack((dc_mean,dc_std)).T\n",
        "      ml = [] # Initialize the merge list as empty\n",
        "      marked = [] #List of checked Pseduo Clusters Indices\n",
        "      unmarked = [] # List of unmerged Pseduo Clusters Indices\n",
        "      for i in range(len(peakindices)):\n",
        "              M = 1 # Set the merge flag as default zero\n",
        "              mindist = math.inf # Set the default Minimum distance between two feature clusters as infinite\n",
        "              minindice = 0 # Set the default Neighboring feature cluster indices as zero\n",
        "              # Check the current Pseduo Feature Cluster has been evaluated or not\n",
        "              if peakindices[i] not in marked:\n",
        "                  for j in range(len(peakindices)):\n",
        "                          if j != i:\n",
        "                              # Divergence Calculation between two pseduo feature clusters\n",
        "                              d = disc[peakindices[i], peakindices[j]]\n",
        "                              if mindist > d:\n",
        "                                  mindist = d\n",
        "                                  minindice = j\n",
        "                  if minindice != 0:\n",
        "                      # Current feature pseduo cluster under check\n",
        "                      current = sample[peakindices[i],:]\n",
        "                      currentfit = pseduof[i]\n",
        "                      # Neighboring feature pseduo cluster of the current checked cluster\n",
        "                      neighbor = sample[peakindices[minindice],:]\n",
        "                      neighborfit = pseduof[minindice]\n",
        "\n",
        "                      # A function to identify the bounady feature instance between two\n",
        "                      # neighboring pseduo feature clusters\n",
        "                      bp = self.boundary_points(disc, f_indices, data,\n",
        "                                                peakindices[i],\n",
        "                                                peakindices[minindice])\n",
        "                      bpf=fitness[bp]\n",
        "                      if bpf < 0.85*min(currentfit, neighborfit):\n",
        "                          M = 0 # Change the Merge flag\n",
        "                      if M == 1:\n",
        "                          ml.append([peakindices[i], peakindices[minindice]])\n",
        "                          marked.append(peakindices[i])\n",
        "                          marked.append(peakindices[minindice])\n",
        "                      else:\n",
        "                          unmarked.append(peakindices[i])\n",
        "      newpi = []\n",
        "      # Update the pseduo feature clusters list with the obtained mergelist\n",
        "      for m in range(np.shape(ml)[0]):\n",
        "          if fitness[ml[m][0]] > fitness[ml[m][1]]:\n",
        "              newpi.append(ml[m][0])\n",
        "              f_indices[c_indices == ml[m][1]] = ml[m][0]\n",
        "          else:\n",
        "              newpi.append(ml[m][1])\n",
        "              f_indices[c_indices == ml[m][0]] = ml[m][1]\n",
        "      # Update the pseduo feature clusters list with pseduo clusters that have not appeared in the merge list\n",
        "      for n in range(len(peakindices)):\n",
        "          if peakindices[n] in unmarked:\n",
        "              newpi.append(peakindices[n])\n",
        "      # Updated pseduo feature clusters information after merging\n",
        "      fcluster = np.unique(newpi)\n",
        "      ffitness = fitness[fcluster]\n",
        "      f_indices = self.close_fcluster(fcluster, disc, np.shape(disc)[0])\n",
        "      return fcluster, ffitness, f_indices\n",
        "\n",
        "  def boundary_points(self, disc, f_indices, data, current, neighbor):\n",
        "      [N, dim] = np.shape(data)\n",
        "      tempcluster1 = np.where(f_indices == current)\n",
        "      tempcluster2 = np.where(f_indices == neighbor)\n",
        "      tempcluster = np.append(tempcluster1, tempcluster2)\n",
        "      d = []\n",
        "      for i in range(len(tempcluster)):\n",
        "          d1 = disc[tempcluster[i], current]\n",
        "          d2 = disc[tempcluster[i], neighbor]\n",
        "          d.append(abs(d1 - d2))\n",
        "      if not d:\n",
        "          bd = current\n",
        "      else:\n",
        "          fi = np.argmin(d)\n",
        "          bd = tempcluster[fi]\n",
        "      return bd\n",
        "\n",
        "  def pseduogeneration(self, psep, n):\n",
        "      pse_mean = psep[:,0]\n",
        "      pse_std = psep[:,1]\n",
        "      data = np.zeros((n, len(pse_mean)))\n",
        "      for i in range(len(pse_mean)):\n",
        "          data[:, i] = (np.repeat(pse_mean[i], n) + pse_std[i] * np.random.randn(n)).T\n",
        "      return data\n",
        "\n",
        "  def psefitness_cal(psep, sample, data, pseduodata, stdf, gamma):\n",
        "      orifn = np.shape(sample)[0]\n",
        "      pn = np.shape(psep)[0]\n",
        "      psepf = np.zeros(pn)\n",
        "      for i in range(pn):\n",
        "          tempsum = 0\n",
        "          for j in range(orifn):\n",
        "              var1 = np.var(data[:,j])\n",
        "              var2 = np.var(pseduodata[:,i])\n",
        "              p = np.corrcoef(data[:,j], pseduodata[:,i])[0,1]\n",
        "              sim = var1 + var2 - ((var1 + var2)**2 - 4 * var1 * var2 * (1 - p**2))**0.5\n",
        "              d_kl = sim / (var1 + var2)\n",
        "              tempsum = tempsum + (math.exp(-(d_kl**2)/stdf))**gamma\n",
        "          psepf[i] = tempsum\n",
        "      return psepf"
      ],
      "metadata": {
        "id": "u_ufqIYe8Aku"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "gGf19cmz4sOP"
      },
      "outputs": [],
      "source": [
        "def main(path, file, feature_type):\n",
        "  start = time.time()\n",
        "  data_loader = data_load(path, file)\n",
        "  [data, label, OriData] = data_loader.process()\n",
        "  [N, dim] = np.shape(data)\n",
        "  fs_calculator = feature_similarity()\n",
        "  [DC_means, DC_std] = fs_calculator.distribution_est(data,dim)\n",
        "  Var = np.var(data,axis=0)\n",
        "  Corr = np.corrcoef(data.T)\n",
        "  if feature_type == 'continuous':\n",
        "    DisC,Dist = fs_calculator.feature_dist_cont(DC_means,DC_std,data,Var,dim,Corr)\n",
        "  else:\n",
        "    DisC,Dist = fs_calculator.feature_dist_disc(data, dim)\n",
        "  end1 = time.time()\n",
        "  print('Distance Calculation Finished:',end1-start)\n",
        "  StdF = (np.max(np.power(Dist,0.5)))**2\n",
        "  gamma = 5\n",
        "  fitness = fitness_cal(DisC, DC_means, DC_std, data, StdF, gamma)\n",
        "  oldfitness = np.copy(fitness)\n",
        "  fc_model = feature_clustering()\n",
        "  [PeakIndices,Pfitness,C_Indices] = fc_model.pseduo_peaks(DisC, Dist, DC_means,\n",
        "                                                           DC_std, data,fitness,\n",
        "                                                           StdF,gamma, Var)\n",
        "  fitness = oldfitness\n",
        "  # Pseduo Clusters Infomormation Extraction\n",
        "  PseDuo = DC_means[PeakIndices] # Pseduo Feature Cluster centers\n",
        "  PseDuoF = Pfitness # Pseduo Feature Clusters fitness values\n",
        "  end2 = time.time()\n",
        "  print('Initial Clustering Search Finished:',end2-start)\n",
        "  #-------------Check for possible merges among pseduo clusters-----------#\n",
        "  [FCluster,Ffitness,C_Indices] = fc_model.pseduo_evolve(DisC, PeakIndices, PseDuoF, C_Indices, DC_means, DC_std, data, fitness, StdF, gamma)\n",
        "  SF = FCluster\n",
        "  Extract_FIndices = SF\n",
        "  label = label.reshape(N,1)\n",
        "  Extract_Data = np.concatenate((OriData[:,SF],label),axis=1)\n",
        "  end2 = time.time()\n",
        "  print('The total time in seconds:',end2-start)\n",
        "  return SF, Extract_Data"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# path = '/content/EUFSFC/'\n",
        "# files = 'Prostate-GE.csv'\n",
        "# ftype = 'continuous'\n",
        "path = '/content/EUFSFC/'\n",
        "files = 'colon.csv'\n",
        "ftype = 'discrete'\n",
        "#--------------------------------------------------------------------------------------------------------------\n",
        "if __name__ == '__main__':\n",
        "  fsub, extract_data = main(path, files, ftype)\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "vJA10ZgKBFQS",
        "outputId": "ee32c689-eda2-4f01-b493-93a6674c0c34"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Distance Calculation Finished: 278.91725397109985\n",
            "Initial Clustering Search Finished: 304.0172927379608\n",
            "The total time in seconds: 306.9641764163971\n"
          ]
        }
      ]
    }
  ]
}